import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go # visualization
import plotly.offline as py # visualization
from dateutil import parser
# Load appdata.csv. The DataFrame is named df throughout; if you rename it,
# be careful to update every later cell that references it.
df = pd.read_csv("appdata.csv")
# First look at the dataset: column dtypes, non-null counts, memory footprint
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user 50000 non-null int64 1 first_open 50000 non-null object 2 dayofweek 50000 non-null int64 3 hour 50000 non-null object 4 age 50000 non-null int64 5 screen_list 50000 non-null object 6 numscreens 50000 non-null int64 7 minigame 50000 non-null int64 8 used_premium_feature 50000 non-null int64 9 enrolled 50000 non-null int64 10 enrolled_date 31074 non-null object 11 liked 50000 non-null int64 dtypes: int64(8), object(4) memory usage: 4.6+ MB
# Summary statistics (count/mean/std/quantiles) of the numeric columns
df.describe()
| user | dayofweek | age | numscreens | minigame | used_premium_feature | enrolled | liked | |
|---|---|---|---|---|---|---|---|---|
| count | 50000.000000 | 50000.000000 | 50000.00000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 |
| mean | 186889.729900 | 3.029860 | 31.72436 | 21.095900 | 0.107820 | 0.172020 | 0.621480 | 0.165000 |
| std | 107768.520361 | 2.031997 | 10.80331 | 15.728812 | 0.310156 | 0.377402 | 0.485023 | 0.371184 |
| min | 13.000000 | 0.000000 | 16.00000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 93526.750000 | 1.000000 | 24.00000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 187193.500000 | 3.000000 | 29.00000 | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 75% | 279984.250000 | 5.000000 | 37.00000 | 28.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| max | 373662.000000 | 6.000000 | 101.00000 | 325.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Peek at the first five raw rows
df.head()
| user | first_open | dayofweek | hour | age | screen_list | numscreens | minigame | used_premium_feature | enrolled | enrolled_date | liked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 2012-12-27 02:14:51.273 | 3 | 02:00:00 | 23 | idscreen,joinscreen,Cycle,product_review,ScanP... | 15 | 0 | 0 | 0 | NaN | 0 |
| 1 | 333588 | 2012-12-02 01:16:00.905 | 6 | 01:00:00 | 24 | joinscreen,product_review,product_review2,Scan... | 13 | 0 | 0 | 0 | NaN | 0 |
| 2 | 254414 | 2013-03-19 19:19:09.157 | 1 | 19:00:00 | 23 | Splash,Cycle,Loan | 3 | 0 | 1 | 0 | NaN | 1 |
| 3 | 234192 | 2013-07-05 16:08:46.354 | 4 | 16:00:00 | 28 | product_review,Home,product_review,Loan3,Finan... | 40 | 0 | 0 | 1 | 2013-07-05 16:11:49.513 | 0 |
| 4 | 51549 | 2013-02-26 18:50:48.661 | 1 | 18:00:00 | 31 | idscreen,joinscreen,Cycle,Credit3Container,Sca... | 32 | 0 | 0 | 1 | 2013-02-26 18:56:37.841 | 1 |
# Count missing values per column (only enrolled_date has NaNs: users who never enrolled)
df.isnull().sum()
user 0 first_open 0 dayofweek 0 hour 0 age 0 screen_list 0 numscreens 0 minigame 0 used_premium_feature 0 enrolled 0 enrolled_date 18926 liked 0 dtype: int64
# Convert the string 'hour' column to an integer hour-of-day.
# NOTE(review): slice(1, 3) assumes each value has a leading character before a
# zero-padded two-digit hour (e.g. " 02:00:00" -> "02") — TODO confirm against
# the raw CSV; for a plain "02:00:00" this slice would yield "2:" and
# astype(int) would raise.
df['hour'] = df.hour.str.slice(1,3).astype(int)
# 'enrolled' is the response variable; 'user' is only an identifier; and
# 'screen_list', 'enrolled_date' and 'first_open' are object-typed columns.
# Drop all five to get a purely numeric frame for exploratory plots.
df1 = df.drop(columns=['enrolled', 'user', 'first_open', 'enrolled_date', 'screen_list'])
# Sanity-check the reduced frame
df1.head()
| dayofweek | hour | age | numscreens | minigame | used_premium_feature | liked | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 2 | 23 | 15 | 0 | 0 | 0 |
| 1 | 6 | 1 | 24 | 13 | 0 | 0 | 0 |
| 2 | 1 | 19 | 23 | 3 | 0 | 1 | 1 |
| 3 | 4 | 16 | 28 | 40 | 0 | 0 | 0 |
| 4 | 1 | 18 | 31 | 32 | 0 | 0 | 1 |
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dayofweek 50000 non-null int64 1 hour 50000 non-null int32 2 age 50000 non-null int64 3 numscreens 50000 non-null int64 4 minigame 50000 non-null int64 5 used_premium_feature 50000 non-null int64 6 liked 50000 non-null int64 dtypes: int32(1), int64(6) memory usage: 2.5 MB
# Quick-and-dirty version: one pandas histogram figure per numeric column
for col in df1.columns:
    df1.hist(col)
# The nicer, interactive version: reuse the plotly styling from the first
# project, lightly adapted for this dataset.
def histogram(column):
    """Render an interactive plotly histogram of df1[column] vs customer counts."""
    bars = go.Histogram(
        x=df1[column],
        name="Customer Distribution",
        marker=dict(line=dict(width=.5, color="black")),
        opacity=.9,
    )
    # Shared axis styling for both axes (white grid on light-grey background)
    axis_style = dict(gridcolor='rgb(255, 255, 255)',
                      zerolinewidth=1,
                      ticklen=5,
                      gridwidth=2)
    fig_layout = go.Layout(dict(
        title=column + " vs customer distribution",
        plot_bgcolor="rgb(243,243,243)",
        paper_bgcolor="rgb(243,243,243)",
        xaxis=dict(title=column, **axis_style),
        yaxis=dict(title="distribution", **axis_style),
    ))
    fig = go.Figure(data=bars, layout=fig_layout)
    py.iplot(fig)
# Draw the pretty plotly histogram for every column of the numeric frame
for col in df1.columns:
    histogram(col)
    plt.show()
# Bar chart of each feature's Pearson correlation with the response (enrolled)
df1.corrwith(df['enrolled']).plot.bar(
    figsize=(20, 10),
    title='Correlation with Response Variable',
    fontsize=15,
    rot=45,
    grid=True,
)
<AxesSubplot:title={'center':'Correlation with Response Variable'}>
# Set the seaborn background style for the heatmap
sns.set(style = 'white', font_scale =2)
# Compute the pairwise Pearson correlation matrix of the numeric features
corr = df1.corr()
# Boolean mask hiding the redundant upper triangle (including the diagonal).
# np.bool was a deprecated alias for the builtin bool (removed in NumPy 1.24;
# the DeprecationWarning is visible in the pasted output above) — using the
# builtin bool is the documented, behavior-identical replacement.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
C:\Users\aashu\AppData\Local\Temp/ipykernel_21112/3650361391.py:11: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
# Inspect the mask: True cells are hidden by the heatmap below
mask
array([[ True, True, True, True, True, True, True],
[False, True, True, True, True, True, True],
[False, False, True, True, True, True, True],
[False, False, False, True, True, True, True],
[False, False, False, False, True, True, True],
[False, False, False, False, False, True, True],
[False, False, False, False, False, False, True]])
# Lay out the matplotlib canvas for the heatmap
f, ax = plt.subplots(figsize=(18, 15))
f.suptitle("Correlation Matrix", fontsize=40)
# Diverging palette: one hue for negative, another for positive correlations
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw only the lower triangle (upper triangle is masked out above)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5})
<AxesSubplot:>
# Interactive plotly version of the same correlation matrix, reusing the
# heatmap code from the first project with minor tweaks.
feature_names = corr.columns.tolist()
heat = go.Heatmap(
    z=np.array(corr),
    x=feature_names,
    y=feature_names,
    colorscale="RdBu",
    colorbar=dict(title="Pearson Correlation coefficient",
                  titleside="right"),
)
heat_layout = go.Layout(dict(
    title="Correlation Matrix for variables",
    autosize=False,
    height=720,
    width=800,
    # Wide left/bottom margins leave room for the long feature names
    margin=dict(r=0, l=210, t=25, b=210),
    yaxis=dict(tickfont=dict(size=9)),
    xaxis=dict(tickfont=dict(size=9)),
))
fig = go.Figure(data=[heat], layout=heat_layout)
py.iplot(fig)
From the heat map we can see that there is almost no correlation between the fields. Among all the fields, there is a slight positive correlation between used_premium_feature and minigame (correlation coefficient 0.108), whereas there is a slight negative correlation between age and numscreens (correlation coefficient -0.128).
# First look at the target variable: 'enrolled' is binary (1 = enrolled, 0 = not)
df['enrolled'].head()
0 0 1 0 2 0 3 1 4 1 Name: enrolled, dtype: int64
# Check dtypes before parsing the date columns below
df.dtypes
user int64 first_open object dayofweek int64 hour int32 age int64 screen_list object numscreens int64 minigame int64 used_premium_feature int64 enrolled int64 enrolled_date object liked int64 dtype: object
# Parse both timestamp columns so we can take their difference later
for date_col in ('enrolled_date', 'first_open'):
    df[date_col] = pd.to_datetime(df[date_col])
# Confirm both columns are now datetime64[ns]
df.dtypes
user int64 first_open datetime64[ns] dayofweek int64 hour int32 age int64 screen_list object numscreens int64 minigame int64 used_premium_feature int64 enrolled int64 enrolled_date datetime64[ns] liked int64 dtype: object
# Whole hours elapsed between first app open and enrollment (NaN when the user
# never enrolled, since enrolled_date is NaT for them).
# .astype('timedelta64[h]') on a timedelta Series was deprecated and then
# removed in pandas 2.x; floor-dividing total seconds by 3600 produces the
# same whole-hour float64 values (NaN propagates unchanged).
df['difference'] = df['enrolled_date'] - df['first_open']
df['difference'] = df['difference'].dt.total_seconds() // 3600
# Inspect rows with a missing difference: all are users who never enrolled
df[df['difference'].isna()].head(50)
| user | first_open | dayofweek | hour | age | screen_list | numscreens | minigame | used_premium_feature | enrolled | enrolled_date | liked | difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 2012-12-27 02:14:51.273 | 3 | 2 | 23 | idscreen,joinscreen,Cycle,product_review,ScanP... | 15 | 0 | 0 | 0 | NaT | 0 | NaN |
| 1 | 333588 | 2012-12-02 01:16:00.905 | 6 | 1 | 24 | joinscreen,product_review,product_review2,Scan... | 13 | 0 | 0 | 0 | NaT | 0 | NaN |
| 2 | 254414 | 2013-03-19 19:19:09.157 | 1 | 19 | 23 | Splash,Cycle,Loan | 3 | 0 | 1 | 0 | NaT | 1 | NaN |
| 6 | 144649 | 2012-12-25 02:33:18.461 | 1 | 2 | 35 | product_review,product_review2,ScanPreview | 3 | 0 | 0 | 0 | NaT | 0 | NaN |
| 7 | 249366 | 2012-12-11 03:07:49.875 | 1 | 3 | 26 | Splash,Cycle,Home,Credit3Container,Credit3Dash... | 41 | 0 | 1 | 0 | NaT | 0 | NaN |
| 11 | 317454 | 2013-05-28 11:07:07.358 | 1 | 11 | 32 | product_review,Home,Loan2,Credit3Container,Ver... | 25 | 1 | 1 | 0 | NaT | 0 | NaN |
| 12 | 205375 | 2012-12-17 06:28:45.903 | 0 | 6 | 25 | idscreen,joinscreen,Cycle,product_review,produ... | 11 | 0 | 0 | 0 | NaT | 0 | NaN |
| 14 | 359855 | 2013-02-18 04:48:48.912 | 0 | 4 | 17 | joinscreen,product_review,product_review2,Scan... | 9 | 0 | 0 | 0 | NaT | 0 | NaN |
| 21 | 236951 | 2013-04-20 04:02:18.337 | 5 | 4 | 38 | Cycle,Splash,Home,Loan2,product_review,product... | 42 | 0 | 1 | 0 | NaT | 0 | NaN |
| 29 | 210060 | 2013-04-27 17:41:24.374 | 5 | 17 | 20 | idscreen,Cycle,Home,Loan2,product_review,produ... | 10 | 0 | 0 | 0 | NaT | 0 | NaN |
| 36 | 310250 | 2013-05-04 22:50:49.276 | 5 | 22 | 36 | Home,Loan2,Loan1,Institutions,Credit3Container... | 47 | 0 | 0 | 0 | NaT | 0 | NaN |
| 41 | 342983 | 2013-07-02 08:21:46.166 | 1 | 8 | 23 | idscreen,Home,Loan3,Finances,Credit3,Settings,... | 13 | 0 | 1 | 0 | NaT | 0 | NaN |
| 42 | 62638 | 2012-12-25 08:15:00.295 | 1 | 8 | 25 | Credit3Container,ScanPreview,location,VerifyMo... | 24 | 0 | 0 | 0 | NaT | 0 | NaN |
| 45 | 67588 | 2013-06-18 02:23:09.487 | 1 | 2 | 30 | product_review,Home,Loan3,Credit3,Settings,Lis... | 31 | 0 | 0 | 0 | NaT | 0 | NaN |
| 46 | 187179 | 2013-05-10 14:38:25.128 | 4 | 14 | 58 | idscreen,Cycle,Splash,Home,product_review,prod... | 45 | 0 | 0 | 0 | NaT | 0 | NaN |
| 49 | 298577 | 2013-01-11 20:26:09.967 | 4 | 20 | 24 | Home,Loan2,Institutions,SelectInstitution,Bank... | 39 | 0 | 1 | 0 | NaT | 0 | NaN |
| 53 | 246710 | 2013-02-18 02:33:09.868 | 0 | 2 | 21 | joinscreen,Cycle,product_review,product_review... | 10 | 0 | 0 | 0 | NaT | 0 | NaN |
| 55 | 335625 | 2013-02-19 04:57:23.449 | 1 | 4 | 22 | joinscreen,Cycle,product_review,product_review... | 19 | 0 | 0 | 0 | NaT | 0 | NaN |
| 56 | 91566 | 2013-03-25 15:13:51.038 | 0 | 15 | 23 | Splash,Home,Loan2,Institutions,GroupedInstitut... | 39 | 0 | 0 | 0 | NaT | 0 | NaN |
| 59 | 117488 | 2013-04-22 20:39:54.297 | 0 | 20 | 24 | Cycle,Splash,Home,product_review,Loan2,Welcome... | 23 | 0 | 0 | 0 | NaT | 1 | NaN |
| 62 | 366999 | 2013-04-12 17:46:47.303 | 4 | 17 | 48 | idscreen,Home,VerifyPhone,Credit1,Credit3Dashb... | 12 | 1 | 0 | 0 | NaT | 0 | NaN |
| 64 | 20747 | 2012-12-09 23:24:51.903 | 6 | 23 | 31 | Loan2,product_review2,ScanPreview,BankVerifica... | 17 | 0 | 0 | 0 | NaT | 0 | NaN |
| 65 | 354498 | 2013-04-28 19:04:21.198 | 6 | 19 | 24 | Splash,MLWebView,product_review,product_review... | 16 | 1 | 1 | 0 | NaT | 0 | NaN |
| 66 | 305707 | 2013-01-30 13:27:47.728 | 2 | 13 | 19 | Cycle,product_review,VerifyCountry,VerifyToken... | 7 | 0 | 0 | 0 | NaT | 0 | NaN |
| 70 | 177834 | 2013-06-02 23:38:32.884 | 6 | 23 | 34 | Home,product_review,ScanPreview,product_review... | 7 | 0 | 0 | 0 | NaT | 0 | NaN |
| 71 | 235616 | 2013-04-19 12:46:13.675 | 4 | 12 | 38 | idscreen,Cycle,Home,Loan2,product_review,produ... | 14 | 0 | 0 | 0 | NaT | 1 | NaN |
| 77 | 274093 | 2013-06-09 16:24:13.664 | 6 | 16 | 28 | Loan3,Loan1,WebView,Finances,Credit3,product_r... | 32 | 0 | 0 | 0 | NaT | 0 | NaN |
| 78 | 139567 | 2013-05-16 03:46:36.147 | 3 | 3 | 22 | Splash,Loan2,product_review,product_review,pro... | 11 | 0 | 0 | 0 | NaT | 0 | NaN |
| 88 | 306457 | 2013-06-16 23:10:45.587 | 6 | 23 | 17 | product_review,product_review,product_review,p... | 5 | 0 | 0 | 0 | NaT | 1 | NaN |
| 91 | 232734 | 2012-12-09 08:23:39.249 | 6 | 8 | 27 | Home,VerifyPhone,ScanPreview,Rewardjoinscreen,... | 8 | 0 | 1 | 0 | NaT | 1 | NaN |
| 95 | 86325 | 2013-05-28 21:52:11.831 | 1 | 21 | 56 | product_review3,location,VerifyCountry,VerifyP... | 13 | 1 | 0 | 0 | NaT | 0 | NaN |
| 97 | 272545 | 2013-04-03 18:54:53.679 | 2 | 18 | 42 | Splash,Home,Loan2,Institutions,AccountView,Loa... | 21 | 0 | 1 | 0 | NaT | 0 | NaN |
| 98 | 93890 | 2013-05-05 19:35:19.920 | 6 | 19 | 24 | idscreen,product_review,ScanPreview,VerifyDate... | 7 | 0 | 0 | 0 | NaT | 0 | NaN |
| 103 | 13148 | 2013-06-14 22:03:41.237 | 4 | 22 | 37 | Home,Loan3,Loan1,WebView | 4 | 0 | 0 | 0 | NaT | 0 | NaN |
| 104 | 140258 | 2013-01-01 10:37:30.571 | 1 | 10 | 27 | Cycle,Credit3Dashboard,Credit3Container,Loan2,... | 34 | 0 | 0 | 0 | NaT | 0 | NaN |
| 107 | 54898 | 2013-04-07 11:36:16.418 | 6 | 11 | 20 | VerifyDateOfBirth | 1 | 1 | 0 | 0 | NaT | 0 | NaN |
| 115 | 78196 | 2013-06-29 19:51:47.752 | 5 | 19 | 26 | Leaderboard,Home,ReferralContainer,product_rev... | 29 | 1 | 0 | 0 | NaT | 0 | NaN |
| 116 | 317630 | 2012-12-21 21:24:31.172 | 4 | 21 | 24 | joinscreen,Cycle,Credit1 | 3 | 0 | 0 | 0 | NaT | 0 | NaN |
| 117 | 96320 | 2013-07-05 19:39:23.703 | 4 | 19 | 17 | Home,Loan3,product_review,location | 4 | 0 | 0 | 0 | NaT | 0 | NaN |
| 118 | 3052 | 2012-12-29 23:49:33.817 | 5 | 23 | 41 | Splash,Cycle,Home,Loan2,Loan1,MLWebView,Histor... | 38 | 0 | 0 | 0 | NaT | 0 | NaN |
| 129 | 316136 | 2013-05-17 15:24:26.158 | 4 | 15 | 37 | Home,product_review,ScanPreview,VerifyDateOfBi... | 11 | 0 | 0 | 0 | NaT | 0 | NaN |
| 132 | 94763 | 2013-05-21 16:56:52.003 | 1 | 16 | 47 | Splash,Home,ReferralContainer,Loan2,Institutio... | 8 | 0 | 0 | 0 | NaT | 1 | NaN |
| 136 | 165008 | 2013-01-03 18:39:05.100 | 3 | 18 | 25 | joinscreen,Cycle,product_review,product_review... | 43 | 0 | 0 | 0 | NaT | 0 | NaN |
| 138 | 276264 | 2013-02-22 07:15:23.922 | 4 | 7 | 18 | joinscreen,product_review,product_review2,Scan... | 20 | 0 | 0 | 0 | NaT | 0 | NaN |
| 144 | 138529 | 2013-06-18 18:14:42.181 | 1 | 18 | 45 | Home,Loan3,product_review,product_review,produ... | 22 | 1 | 0 | 0 | NaT | 0 | NaN |
| 145 | 339263 | 2013-03-07 19:08:59.669 | 3 | 19 | 28 | Splash,Cycle,Home,Institutions,SelectInstituti... | 12 | 0 | 0 | 0 | NaT | 0 | NaN |
| 153 | 347837 | 2013-01-09 07:54:34.356 | 2 | 7 | 28 | Cycle,Home,Loan2,Loan1,MLWebView,Settings,Prof... | 40 | 0 | 0 | 0 | NaT | 0 | NaN |
| 158 | 155344 | 2013-05-24 22:42:13.825 | 4 | 22 | 44 | idscreen,Cycle,Splash,Home,ProfilePage,Rewards... | 56 | 1 | 1 | 0 | NaT | 0 | NaN |
| 160 | 197551 | 2013-02-01 09:06:43.309 | 4 | 9 | 22 | SignupName,Signup,idscreen,joinscreen,Cycle,Ho... | 33 | 0 | 0 | 0 | NaT | 1 | NaN |
| 161 | 178798 | 2013-06-30 02:19:02.944 | 6 | 2 | 49 | idscreen,Home,Loan3,Finances,Loan1,History,Cre... | 12 | 0 | 1 | 0 | NaT | 0 | NaN |
# Distribution of enrollment delay, clipped to the first 48 hours
delays = df['difference'].dropna()
plt.hist(delays, color='#3F537D', range=[0, 48])
plt.title('Distribution of time since enrolled')
plt.show()
For many users we do not have an exact enrollment date, so we cannot compute the time difference between the enrollment date and the first-open date for them. Therefore we drop the observations in which the enrollment date is missing.
We have used the `range` argument of `plt.hist` to subset our data: the histogram is plotted only for observations whose 'difference' value lies between 0 and 48 hours.
It is clear from the plot that most users enroll within 5 hours of first opening the app.
# Inspect users whose enrollment came more than 48 hours after first open
df[df.difference > 48].head(10)
| user | first_open | dayofweek | hour | age | screen_list | numscreens | minigame | used_premium_feature | enrolled | enrolled_date | liked | difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | 372004 | 2013-03-20 14:22:01.569 | 2 | 14 | 29 | product_review,product_review2,ScanPreview,Ver... | 33 | 1 | 1 | 1 | 2013-04-27 22:24:54.542 | 0 | 920.0 |
| 13 | 307608 | 2013-05-25 19:52:31.798 | 5 | 19 | 23 | Alerts,ProfilePage,Home,Credit3Container | 4 | 0 | 0 | 1 | 2013-06-18 14:27:42.824 | 0 | 570.0 |
| 15 | 284938 | 2013-02-02 18:41:35.724 | 5 | 18 | 25 | idscreen,joinscreen,Cycle,Loan2,product_review... | 26 | 1 | 0 | 1 | 2013-04-29 21:10:04.466 | 0 | 2066.0 |
| 17 | 141402 | 2013-02-02 21:12:46.888 | 5 | 21 | 55 | joinscreen,Cycle,product_review,Loan2,product_... | 20 | 0 | 0 | 1 | 2013-02-11 01:35:03.098 | 0 | 196.0 |
| 27 | 85089 | 2013-01-23 01:43:05.398 | 2 | 1 | 35 | idscreen,joinscreen,Home,Loan2,ProfilePage,Pro... | 35 | 0 | 0 | 1 | 2013-02-24 01:21:26.698 | 0 | 767.0 |
| 50 | 357863 | 2013-05-25 08:52:07.270 | 5 | 8 | 26 | Splash,idscreen,Cycle,product_review,Home,Loan... | 17 | 0 | 0 | 1 | 2013-05-28 04:49:43.840 | 0 | 67.0 |
| 58 | 44814 | 2013-02-05 02:08:23.702 | 1 | 2 | 31 | idscreen,joinscreen,Credit3Container,ScanPrevi... | 27 | 0 | 0 | 1 | 2013-04-08 18:16:42.370 | 0 | 1504.0 |
| 73 | 149499 | 2013-01-21 01:40:09.997 | 0 | 1 | 22 | idscreen,joinscreen,Cycle,product_review,Loan2... | 8 | 0 | 0 | 1 | 2013-02-01 18:27:33.664 | 0 | 280.0 |
| 85 | 329480 | 2013-04-24 19:02:47.489 | 2 | 19 | 25 | product_review,product_review,Loan2,Institutio... | 10 | 0 | 0 | 1 | 2013-05-03 06:11:40.051 | 0 | 203.0 |
| 86 | 60702 | 2013-03-04 05:46:38.360 | 0 | 5 | 31 | joinscreen,Cycle,Credit3Container,ScanPreview,... | 41 | 0 | 0 | 1 | 2013-03-06 23:14:47.807 | 0 | 65.0 |
# Re-label late enrollers (>48 h) as not enrolled, so the response now means
# "enrolled within 48 hours of first open"
df.loc[df.difference > 48, 'enrolled'] = 0
We have assigned 0 to the 'enrolled' feature only for those observations where the 'difference' column has a value greater than 48 hours.
# The timing columns have served their purpose; drop them before modelling
df = df.drop(columns=['difference', 'first_open', 'enrolled_date'])
# Load the most popular screens (screens visited within the first 24 hours).
# The file has a single column, so extract it as a plain numpy array.
top_screens = pd.read_csv("popular_screens.csv").top_screens.values
top_screens
array(['Loan2', 'location', 'Institutions', 'Credit3Container',
'VerifyPhone', 'BankVerification', 'VerifyDateOfBirth',
'ProfilePage', 'VerifyCountry', 'Cycle', 'idscreen',
'Credit3Dashboard', 'Loan3', 'CC1Category', 'Splash', 'Loan',
'CC1', 'RewardsContainer', 'Credit3', 'Credit1', 'EditProfile',
'Credit2', 'Finances', 'CC3', 'Saving9', 'Saving1', 'Alerts',
'Saving8', 'Saving10', 'Leaderboard', 'Saving4', 'VerifyMobile',
'VerifyHousing', 'RewardDetail', 'VerifyHousingAmount',
'ProfileMaritalStatus', 'ProfileChildren ', 'ProfileEducation',
'Saving7', 'ProfileEducationMajor', 'Rewards', 'AccountView',
'VerifyAnnualIncome', 'VerifyIncomeType', 'Saving2', 'Saving6',
'Saving2Amount', 'Saving5', 'ProfileJobTitle', 'Login',
'ProfileEmploymentLength', 'WebView', 'SecurityModal', 'Loan4',
'ResendToken', 'TransactionList', 'NetworkFailure', 'ListPicker'],
dtype=object)
# Append a trailing comma so every screen name in the list is comma-terminated;
# without it the last screen of each row would have one comma fewer, breaking
# the comma-count logic used below.
df['screen_list'] = df.screen_list.astype(str) + ','
df['screen_list']
0 idscreen,joinscreen,Cycle,product_review,ScanP...
1 joinscreen,product_review,product_review2,Scan...
2 Splash,Cycle,Loan,
3 product_review,Home,product_review,Loan3,Finan...
4 idscreen,joinscreen,Cycle,Credit3Container,Sca...
...
49995 Splash,Home,ScanPreview,VerifyPhone,VerifySSN,...
49996 Cycle,Splash,Home,RewardsContainer,
49997 joinscreen,product_review,product_review2,Scan...
49998 Cycle,Home,product_review,product_review,produ...
49999 product_review,ScanPreview,VerifyDateOfBirth,V...
Name: screen_list, Length: 50000, dtype: object
# One-hot encode each popular screen, removing matched names from screen_list
# as we go; whatever remains afterwards is counted as 'other'.
for sc in top_screens:
    # Match the full comma-terminated screen name. A bare substring test
    # (str.contains(sc)) mis-flags prefixes — e.g. 'Loan' is processed before
    # 'Loan4' in top_screens, so it would wrongly flag rows whose remaining
    # list still contains 'Loan4,'. Anchoring on the trailing comma appended
    # above makes the match exact; regex=False keeps it a literal comparison.
    df[sc] = df.screen_list.str.contains(sc + ",", regex=False).astype(int)
    df['screen_list'] = df.screen_list.str.replace(sc + ",", "", regex=False)
# Screens not in the popularity list remain; count them via their commas
df['other'] = df.screen_list.str.count(",")
df.head()
| user | dayofweek | hour | age | numscreens | minigame | used_premium_feature | enrolled | liked | Loan2 | ... | Login | ProfileEmploymentLength | WebView | SecurityModal | Loan4 | ResendToken | TransactionList | NetworkFailure | ListPicker | other | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 3 | 2 | 23 | 15 | 0 | 0 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
| 1 | 333588 | 6 | 1 | 24 | 13 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 2 | 254414 | 1 | 19 | 23 | 3 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 234192 | 4 | 16 | 28 | 40 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
| 4 | 51549 | 1 | 18 | 31 | 32 | 0 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 |
5 rows × 68 columns
# Funnels (screen groups provided to you): collapse each group of related
# screens into a single count feature, then drop the member indicator columns.
funnels = {
    # Savings screens
    "SavingCount": ["Saving1", "Saving2", "Saving2Amount", "Saving4", "Saving5",
                    "Saving6", "Saving7", "Saving8", "Saving9", "Saving10"],
    # Credit monitoring screens
    "CMCount": ["Credit1", "Credit2", "Credit3",
                "Credit3Container", "Credit3Dashboard"],
    # Credit card screens
    "CCCount": ["CC1", "CC1Category", "CC3"],
    # Loan screens
    "LoansCount": ["Loan", "Loan2", "Loan3", "Loan4"],
}
for count_col, screens in funnels.items():
    df[count_col] = df[screens].sum(axis=1)
    df = df.drop(columns=screens)
# Verify every remaining column is numeric after the funnel aggregation
df.dtypes
user int64 dayofweek int64 hour int32 age int64 numscreens int64 minigame int64 used_premium_feature int64 enrolled int64 liked int64 location int32 Institutions int32 VerifyPhone int32 BankVerification int32 VerifyDateOfBirth int32 ProfilePage int32 VerifyCountry int32 Cycle int32 idscreen int32 Splash int32 RewardsContainer int32 EditProfile int32 Finances int32 Alerts int32 Leaderboard int32 VerifyMobile int32 VerifyHousing int32 RewardDetail int32 VerifyHousingAmount int32 ProfileMaritalStatus int32 ProfileChildren int32 ProfileEducation int32 ProfileEducationMajor int32 Rewards int32 AccountView int32 VerifyAnnualIncome int32 VerifyIncomeType int32 ProfileJobTitle int32 Login int32 ProfileEmploymentLength int32 WebView int32 SecurityModal int32 ResendToken int32 TransactionList int32 NetworkFailure int32 ListPicker int32 other int64 SavingCount int64 CMCount int64 CCCount int64 LoansCount int64 dtype: object
# Final look at the engineered feature set (50 columns)
df.head()
| user | dayofweek | hour | age | numscreens | minigame | used_premium_feature | enrolled | liked | location | ... | SecurityModal | ResendToken | TransactionList | NetworkFailure | ListPicker | other | SavingCount | CMCount | CCCount | LoansCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 3 | 2 | 23 | 15 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 7 | 0 | 0 | 0 | 1 |
| 1 | 333588 | 6 | 1 | 24 | 13 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 1 |
| 2 | 254414 | 1 | 19 | 23 | 3 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 234192 | 4 | 16 | 28 | 40 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 3 | 0 | 1 |
| 4 | 51549 | 1 | 18 | 31 | 32 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 2 | 0 | 1 |
5 rows × 50 columns
df.to_csv('new_appdata.csv', index = False) # Save the cleaned, fully numeric file
# Import the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go #visualization
import plotly.offline as py #visualization
from dateutil import parser
import time
import warnings
# NOTE(review): this blanket-suppresses *all* warnings, including deprecations
# like the np.bool one seen earlier; consider filtering specific categories.
warnings.filterwarnings("ignore")
# Read the cleaned file saved above
df = pd.read_csv("new_appdata.csv")
# NOTE(review): X = df binds a second name to the SAME DataFrame (no copy);
# this is safe here only because the later drop() returns a new frame rather
# than mutating in place.
X = df
# Quick sanity check of the reloaded data
X.head()
| user | dayofweek | hour | age | numscreens | minigame | used_premium_feature | enrolled | liked | location | ... | SecurityModal | ResendToken | TransactionList | NetworkFailure | ListPicker | other | SavingCount | CMCount | CCCount | LoansCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 3 | 2 | 23 | 15 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 7 | 0 | 0 | 0 | 1 |
| 1 | 333588 | 6 | 1 | 24 | 13 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 1 |
| 2 | 254414 | 1 | 19 | 23 | 3 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 234192 | 4 | 16 | 28 | 40 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 3 | 0 | 1 |
| 4 | 51549 | 1 | 18 | 31 | 32 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 2 | 0 | 1 |
5 rows × 50 columns
# Separate the response variable from the predictors
Y = X['enrolled']
X = X.drop(columns=['enrolled'])
# 80/20 train-test split, seeded for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Save the user IDs from the training rows before removing the column, so each
# prediction can later be mapped back to its user.
train_ID = X_train['user']
# 'user' is only an identifier — drop it from the training features
X_train = X_train.drop(columns = 'user')
# Same bookkeeping for the test set: save IDs, then drop the column
test_ID = X_test['user']
X_test = X_test.drop(columns='user')
# Standardize the features (zero mean, unit variance). The scaler returns a
# bare numpy array, losing column names and index, so rebuild labelled
# DataFrames around the scaled values.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train),
                        columns=X_train.columns.values,
                        index=X_train.index.values)
# Use the statistics fitted on the training set to transform the test set
X_test2 = pd.DataFrame(sc_X.transform(X_test),
                       columns=X_test.columns.values,
                       index=X_test.index.values)
# Scale the full reloaded frame and display the resulting array.
# NOTE(review): the original called fit_transform(df) and then transform(df)
# again — fit_transform already returns the transformed array, so the second
# call was redundant work with an identical result. Also note this frame still
# contains 'user' and the response 'enrolled'; the scaled output is only
# displayed here and is not what the classifier below is fit on.
sc = StandardScaler()
sc.fit_transform(df)
array([[ 0.44768874, -0.01469505, -1.41936322, ..., -0.76202179,
-0.28861861, 0.31234541],
[ 1.3612485 , 1.4617 , -1.55380801, ..., -0.76202179,
-0.28861861, 0.31234541],
[ 0.62657393, -0.99895842, 0.8661982 , ..., -0.76202179,
-0.28861861, 0.31234541],
...,
[ 1.07154134, -0.50682673, 1.26953257, ..., -0.76202179,
-0.28861861, -1.16376713],
[ 1.28067686, 1.4617 , -0.07491533, ..., -0.76202179,
-0.28861861, -1.16376713],
[-1.48321911, 0.47743663, -1.55380801, ..., -0.76202179,
-0.28861861, 0.31234541]])
# Import the model: logistic regression with L2 (Ridge) regularization.
from sklearn.linear_model import LogisticRegression
# With many (partly correlated) screen features, L1 (Lasso) or L2 (Ridge)
# regularization helps control over-fitting; L2 shrinks coefficients of
# correlated predictors rather than zeroing them out.
classifier = LogisticRegression(penalty='l2',random_state=0)
# Fit on the training set. Screens can be correlated with each other (and we
# built funnel counts from them), so the L2 penalty tempers any single field
# that would otherwise dominate.
# NOTE(review): the model is fit on the UNSCALED X_train even though a scaled
# X_train2 was prepared above — confirm which was intended. Train and test are
# at least treated consistently (prediction below also uses unscaled X_test).
lrf = classifier.fit(X_train,y_train)
# Predict enrollment for the held-out test set
y_pred = classifier.predict(X_test)
# Evaluate the results: build the confusion matrix and import the metric helpers
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
from sklearn import metrics
# Rows = actual class (0, 1), columns = predicted class (0, 1)
cm = confusion_matrix(y_test, y_pred)
# Display the raw confusion matrix counts
cm
array([[3898, 1174],
[1139, 3789]], dtype=int64)
# Report the four headline classification metrics on the held-out test set,
# one per line, in the order: accuracy, precision, recall, F1.
for metric_fn in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric_fn(y_test, y_pred))
0.7687 0.7634495264960709 0.7688717532467533 0.7661510464058235
# 10-fold cross-validation: the model is re-fit and scored on ten different
# train/validation splits of the training data.
from sklearn.model_selection import cross_val_score
# NOTE: 'accuracies' holds the MEAN accuracy across the 10 folds, not the
# per-fold scores.
accuracies = np.mean(cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10))
print(accuracies)
0.76505
Yes, the model holds up: it achieves roughly 76% accuracy, which is quite good. We verified this both with the confusion matrix on the held-out test set and with 10-fold cross-validation via cross_val_score.
# Formatting the final results: join actual labels with user IDs.
# concat aligns y_test and test_ID on their shared (shuffled) index; the
# dropna() is a no-op safeguard since both come from the same split.
final_results = pd.concat([y_test, test_ID], axis = 1).dropna()
# y_pred is a plain ndarray, so this assignment is positional — valid because
# final_results preserves the row order of X_test.
final_results['predicted_results'] = y_pred
# Reorder the columns so the final results look like this
# (user, enrolled, predicted results)
final_results = final_results[["user","enrolled","predicted_results"]]
final_results
| user | enrolled | predicted_results | |
|---|---|---|---|
| 11841 | 239786 | 1 | 1 |
| 19602 | 279644 | 1 | 1 |
| 45519 | 98290 | 0 | 0 |
| 25747 | 170150 | 1 | 1 |
| 42642 | 237568 | 1 | 1 |
| ... | ... | ... | ... |
| 25091 | 143036 | 1 | 0 |
| 27853 | 91158 | 1 | 1 |
| 47278 | 248318 | 0 | 0 |
| 37020 | 142418 | 1 | 1 |
| 2217 | 279355 | 1 | 0 |
10000 rows × 3 columns
This model has produced a very important column, predicted_results. It tells the company which users are likely to enroll in the paid product and which are not, which will help them tailor their marketing strategies accordingly.
We can see that currently the financial situation for this app is very bleak and it needs to improve for the app to merely stay in the game. In order to do that several effective promotion and marketing strategies need to be employed in terms of segmentation of the age groups, as well as improving model accuracy to predict the status of the customers in the long term.